{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.2"
    },
    "colab": {
      "name": "ProtBert-BFD.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "accelerator": "GPU",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "ca117bc5582d457bb28af0a1359b460a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_238791ab5cee4f738d064f57be317113",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_03d6b4a331104237964d19f9f2622ee7",
              "IPY_MODEL_a7e0e19922984ec49d758da0cb0541b4"
            ]
          }
        },
        "238791ab5cee4f738d064f57be317113": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "03d6b4a331104237964d19f9f2622ee7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_293f2f5261c642f3ba3fe8b95d0d36b3",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 313,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 313,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_c8d5cd6b7e9e4bf78964aee7dde5576d"
          }
        },
        "a7e0e19922984ec49d758da0cb0541b4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_26aceb4d067d46edaa92bdab74aaccf8",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 313/313 [00:00&lt;00:00, 742B/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_b3278362084948a3bb403160d0327a71"
          }
        },
        "293f2f5261c642f3ba3fe8b95d0d36b3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "c8d5cd6b7e9e4bf78964aee7dde5576d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "26aceb4d067d46edaa92bdab74aaccf8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "b3278362084948a3bb403160d0327a71": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "81f32c9ae9fa4c978dfd8ec6525c66e0": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_850bc0a3a94d4c2294cb0465aea84ac1",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_2b43f800dd44414f9fd419e912145bf6",
              "IPY_MODEL_510e072b0a08458793a6f92864a91352"
            ]
          }
        },
        "850bc0a3a94d4c2294cb0465aea84ac1": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "2b43f800dd44414f9fd419e912145bf6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_57dfd80181d24b07a42c31fd7d5bcbe5",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 81,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 81,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_8068df51dd6d4bbcaab41032a8f00025"
          }
        },
        "510e072b0a08458793a6f92864a91352": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_85c6a76a58dc48189397d79ad6f54663",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 81.0/81.0 [00:25&lt;00:00, 3.18B/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_5fec21026886492081bab799978cc64e"
          }
        },
        "57dfd80181d24b07a42c31fd7d5bcbe5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "8068df51dd6d4bbcaab41032a8f00025": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "85c6a76a58dc48189397d79ad6f54663": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "5fec21026886492081bab799978cc64e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "82c618c039ea4f7dbf9ebf67bb6d5dee": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_436b663b4efa4c58848ff9ff5e323e4a",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_d8452bd1a385473c8e7258ae8360fb89",
              "IPY_MODEL_827c44e3216b423786dfa537bf06c69f"
            ]
          }
        },
        "436b663b4efa4c58848ff9ff5e323e4a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "d8452bd1a385473c8e7258ae8360fb89": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_2bb13cee19b6441f9b7b180e2e47a04b",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 1684058277,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 1684058277,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_00f9acbdd3854649aaacbab0e81bf8e1"
          }
        },
        "827c44e3216b423786dfa537bf06c69f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_7e550ce8699743408b22536fa4b2cb3f",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 1.68G/1.68G [00:23&lt;00:00, 73.2MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_b3b5d0bd1ccf468989fa5f8151933639"
          }
        },
        "2bb13cee19b6441f9b7b180e2e47a04b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "00f9acbdd3854649aaacbab0e81bf8e1": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "7e550ce8699743408b22536fa4b2cb3f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "b3b5d0bd1ccf468989fa5f8151933639": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/agemagician/ProtTrans/blob/master/Embedding/PyTorch/Basic/ProtBert-BFD.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fXFGYhQo-h2F",
        "colab_type": "text"
      },
      "source": [
        "<h3> Extracting protein sequences' features using ProtBert-BFD pretrained-model <h3>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eRKqEJ5L-h2H",
        "colab_type": "text"
      },
      "source": [
        "<b>1. Load necessry libraries including huggingface transformers<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GXAKFATm-mbs",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        },
        "outputId": "b6bf5b6a-2c63-42f0-d7d9-c9e5b6fd69e4"
      },
      "source": [
        "!pip install -q transformers"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\u001b[K     |████████████████████████████████| 778kB 8.0MB/s \n",
            "\u001b[K     |████████████████████████████████| 1.1MB 13.0MB/s \n",
            "\u001b[K     |████████████████████████████████| 3.0MB 45.9MB/s \n",
            "\u001b[K     |████████████████████████████████| 890kB 64.5MB/s \n",
            "\u001b[?25h  Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jd3YQUd1-h2I",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import torch\n",
        "from transformers import AutoTokenizer, AutoModel, pipeline\n",
        "import re\n",
        "import numpy as np\n",
        "import os\n",
        "import requests\n",
        "from tqdm.auto import tqdm"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hAKCMu_2-h2V",
        "colab_type": "text"
      },
      "source": [
        "<b>2. Load the vocabulary and ProtBert-BFD Model<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HS8i5sOJ-h2W",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 115,
          "referenced_widgets": [
            "ca117bc5582d457bb28af0a1359b460a",
            "238791ab5cee4f738d064f57be317113",
            "03d6b4a331104237964d19f9f2622ee7",
            "a7e0e19922984ec49d758da0cb0541b4",
            "293f2f5261c642f3ba3fe8b95d0d36b3",
            "c8d5cd6b7e9e4bf78964aee7dde5576d",
            "26aceb4d067d46edaa92bdab74aaccf8",
            "b3278362084948a3bb403160d0327a71",
            "81f32c9ae9fa4c978dfd8ec6525c66e0",
            "850bc0a3a94d4c2294cb0465aea84ac1",
            "2b43f800dd44414f9fd419e912145bf6",
            "510e072b0a08458793a6f92864a91352",
            "57dfd80181d24b07a42c31fd7d5bcbe5",
            "8068df51dd6d4bbcaab41032a8f00025",
            "85c6a76a58dc48189397d79ad6f54663",
            "5fec21026886492081bab799978cc64e"
          ]
        },
        "outputId": "c1ad9960-054d-41b4-825b-317c19dcb094"
      },
      "source": [
        "tokenizer = AutoTokenizer.from_pretrained(\"Rostlab/prot_bert_bfd\", do_lower_case=False )"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "ca117bc5582d457bb28af0a1359b460a",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=313.0, style=ProgressStyle(description_…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "81f32c9ae9fa4c978dfd8ec6525c66e0",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=81.0, style=ProgressStyle(description_w…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ERtkR05t-h2c",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66,
          "referenced_widgets": [
            "82c618c039ea4f7dbf9ebf67bb6d5dee",
            "436b663b4efa4c58848ff9ff5e323e4a",
            "d8452bd1a385473c8e7258ae8360fb89",
            "827c44e3216b423786dfa537bf06c69f",
            "2bb13cee19b6441f9b7b180e2e47a04b",
            "00f9acbdd3854649aaacbab0e81bf8e1",
            "7e550ce8699743408b22536fa4b2cb3f",
            "b3b5d0bd1ccf468989fa5f8151933639"
          ]
        },
        "outputId": "7ce59662-54e5-4e08-f907-529adf65c3b7"
      },
      "source": [
        "model = AutoModel.from_pretrained(\"Rostlab/prot_bert_bfd\")"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "82c618c039ea4f7dbf9ebf67bb6d5dee",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1684058277.0, style=ProgressStyle(descr…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XDnhEiyI-h2g",
        "colab_type": "text"
      },
      "source": [
        "<b>3. Load the model into the GPU if avilabile<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KNweYMPQ-h2h",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "fe = pipeline('feature-extraction', model=model, tokenizer=tokenizer,device=0 )"
      ],
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AWbZG3wu-h2l",
        "colab_type": "text"
      },
      "source": [
        "<b>4. Create or load sequences and map rarely occured amino acids (U,Z,O,B) to (X)<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2kqLvaxa-h2l",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "sequences_Example = [\"A E T C Z A O\",\"S K T Z P\"]"
      ],
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rLxGKOwe-h2r",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "sequences_Example = [re.sub(r\"[UZOB]\", \"X\", sequence) for sequence in sequences_Example]"
      ],
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jfk9vkyV-h2x",
        "colab_type": "text"
      },
      "source": [
        "<b>5. Extracting sequences' features and covert the output to numpy if needed<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gZPpLMK2-h2y",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "embedding = fe(sequences_Example)"
      ],
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eRsQnB8Y-h24",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "embedding = np.array(embedding)"
      ],
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FIvh0j8e-h26",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 476
        },
        "outputId": "b28def7a-4e27-4a5c-c7e2-735a03a47c1b"
      },
      "source": [
        "print(embedding)"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[[[ 0.07873335  0.02530318  0.10022405 ... -0.11921909 -0.08556543\n",
            "    0.0157964 ]\n",
            "  [ 0.05551133 -0.10461304 -0.03253962 ...  0.05091606  0.04318975\n",
            "    0.10181108]\n",
            "  [ 0.13895561 -0.046583    0.02193631 ...  0.06942613  0.14762992\n",
            "    0.06503808]\n",
            "  ...\n",
            "  [ 0.08129995 -0.1092955  -0.03022903 ...  0.08717731  0.02061446\n",
            "    0.05156654]\n",
            "  [ 0.06197417 -0.06417818 -0.02039655 ... -0.02796507  0.0884005\n",
            "    0.07532689]\n",
            "  [ 0.09792715 -0.02395107 -0.07244891 ... -0.02299493  0.03639549\n",
            "    0.0507541 ]]\n",
            "\n",
            " [[ 0.11757391 -0.01604314  0.12586999 ... -0.0931114  -0.09125263\n",
            "   -0.01659234]\n",
            "  [-0.06304268 -0.23687428 -0.07115868 ... -0.03852162 -0.00322069\n",
            "   -0.05244054]\n",
            "  [ 0.01905588 -0.105173   -0.02930211 ... -0.00238627 -0.09289714\n",
            "    0.02722595]\n",
            "  ...\n",
            "  [-0.00986713 -0.15825894 -0.08714887 ... -0.10954074 -0.03796846\n",
            "   -0.03043361]\n",
            "  [-0.01160221 -0.16065687 -0.11397075 ... -0.07776542 -0.00063999\n",
            "   -0.02496129]\n",
            "  [ 0.002604   -0.1685513  -0.06161089 ... -0.09357076 -0.0614622\n",
            "   -0.02907946]]]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AuRYxkbO-h29",
        "colab_type": "text"
      },
      "source": [
        "<b>Optional: Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that is added by ProtBert-BFD model<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HUL5nEBy-h2-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "features = [] \n",
        "\n",
        "for seq_num in range(len(embedding)):\n",
        "    seq_len = len(sequences_Example[seq_num].replace(\" \", \"\"))\n",
        "    start_Idx = 1\n",
        "    end_Idx = seq_len+1\n",
        "    seq_emd = embedding[seq_num][start_Idx:end_Idx]\n",
        "    features.append(seq_emd)"
      ],
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1-sP8qGp-h3B",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 391
        },
        "outputId": "bdd63f82-1b45-462b-c6ed-a4d8d621a3b4"
      },
      "source": [
        "print(features)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[array([[ 0.05551133, -0.10461304, -0.03253962, ...,  0.05091606,\n",
            "         0.04318975,  0.10181108],\n",
            "       [ 0.13895561, -0.046583  ,  0.02193631, ...,  0.06942613,\n",
            "         0.14762992,  0.06503808],\n",
            "       [ 0.14610603, -0.08092842, -0.12500416, ..., -0.03651231,\n",
            "         0.02485525,  0.07977536],\n",
            "       ...,\n",
            "       [ 0.02349902, -0.01549769, -0.05685329, ..., -0.01342281,\n",
            "         0.01704315,  0.06431052],\n",
            "       [ 0.08129995, -0.1092955 , -0.03022903, ...,  0.08717731,\n",
            "         0.02061446,  0.05156654],\n",
            "       [ 0.06197417, -0.06417818, -0.02039655, ..., -0.02796507,\n",
            "         0.0884005 ,  0.07532689]]), array([[-0.06304268, -0.23687428, -0.07115868, ..., -0.03852162,\n",
            "        -0.00322069, -0.05244054],\n",
            "       [ 0.01905588, -0.105173  , -0.02930211, ..., -0.00238627,\n",
            "        -0.09289714,  0.02722595],\n",
            "       [ 0.07721861, -0.1703198 , -0.13987812, ..., -0.08390203,\n",
            "         0.03587941, -0.01317161],\n",
            "       [ 0.00872737, -0.1771819 , -0.05856298, ..., -0.09918059,\n",
            "        -0.06392955, -0.02541981],\n",
            "       [-0.03867153, -0.17455773, -0.08231924, ...,  0.00600081,\n",
            "        -0.00368144, -0.13687409]])]\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}